﻿//  
//  Article: Parsing HTML Tags in C#
//  Source: Code Project
//  http://www.codeproject.com/Articles/57176/Parsing-HTML-Tags-in-C
//

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;

namespace NZBMatrixAdvancedFeedReader.Classes
{
    /// <summary>
    /// Groups a small, forgiving HTML tag scanner (<see cref="HtmlParser"/>),
    /// the tag description it produces (<see cref="HtmlTag"/>) and a sample
    /// routine that uses both to enumerate the links of a page.
    /// </summary>
    public class HTMLHelper
    {

        /// <summary>
        /// Describes one opening tag found by <see cref="HtmlParser"/>.
        /// </summary>
        public class HtmlTag
        {
            /// <summary>
            /// Name of this tag, with the casing used in the document
            /// </summary>
            public string Name { get; set; }

            /// <summary>
            /// Collection of attribute names and values for this tag. The
            /// parser creates this dictionary with a case-insensitive key
            /// comparer, so "href" also finds an attribute written "HREF".
            /// Attributes without a value are stored with an empty string.
            /// </summary>
            public Dictionary<string, string> Attributes { get; set; }

            /// <summary>
            /// True if this tag contained a trailing forward slash
            /// </summary>
            public bool TrailingSlash { get; set; }
        }

        /// <summary>
        /// Forward-only scanner that walks an HTML string and returns its
        /// opening tags one at a time. Comments, closing tags and the
        /// contents of script elements are skipped. The scanner is lenient:
        /// malformed or truncated markup never throws and never loops
        /// forever; the position simply ends up at the end of the document.
        /// </summary>
        public class HtmlParser
        {

            /// <summary>Document being parsed (never null after Reset)</summary>
            protected string _html;

            /// <summary>Current position, always in [0, _html.Length]</summary>
            protected int _pos;

            /// <summary>
            /// Set by ParseTag when the tag just parsed opened a script
            /// element whose contents must be skipped
            /// </summary>
            protected bool _scriptBegin;

            /// <summary>
            /// Creates a parser positioned at the start of the given document
            /// </summary>
            /// <param name="html">Document to parse; null is treated as an
            /// empty document</param>
            public HtmlParser(string html)
            {
                Reset(html);
            }

            /// <summary>
            /// Resets the current position to the start of the current document
            /// </summary>
            public void Reset()
            {
                _pos = 0;
            }

            /// <summary>
            /// Sets the current document and resets the current position to the
            /// start of it
            /// </summary>
            /// <param name="html">Document to parse; null is treated as an
            /// empty document</param>
            public void Reset(string html)
            {
                // Every other member dereferences _html, so normalize null
                // here once instead of guarding at each use.
                _html = html ?? String.Empty;
                _pos = 0;
            }

            /// <summary>
            /// Indicates if the current position is at the end of the current
            /// document
            /// </summary>
            public bool EOF
            {
                get { return (_pos >= _html.Length); }
            }

            /// <summary>
            /// Parses the next tag that matches the specified tag name
            /// </summary>
            /// <param name="name">Name of the tags to parse ("*" = parse all
            /// tags). Matching ignores case.</param>
            /// <param name="tag">Returns information on the next occurrence
            /// of the specified tag or null if none found</param>
            /// <returns>True if a tag was parsed or false if the end of the
            /// document was reached (or no name was specified)</returns>
            public bool ParseNext(string name, out HtmlTag tag)
            {
                tag = null;

                // Nothing to do if no tag specified
                if (String.IsNullOrEmpty(name))
                    return false;

                // Loop until match is found or there are no more tags
                while (MoveToNextTag())
                {
                    // Skip opening '<'
                    Move();

                    // Examine first tag character
                    char c = Peek();
                    if (c == '!' && Peek(1) == '-' && Peek(2) == '-')
                    {
                        // Skip over comments. Ordinal search: the terminator
                        // is markup, not linguistic text.
                        const string endComment = "-->";
                        _pos = _html.IndexOf(endComment, _pos,
                          StringComparison.Ordinal);
                        NormalizePosition();
                        Move(endComment.Length);
                    }
                    else if (c == '/')
                    {
                        // Skip over closing tags
                        _pos = _html.IndexOf('>', _pos);
                        NormalizePosition();
                        Move();
                    }
                    else
                    {
                        // Parse tag
                        bool result = ParseTag(name, ref tag);

                        // Because scripts may contain tag characters,
                        // we need special handling to skip over
                        // script contents
                        if (_scriptBegin)
                        {
                            const string endScript = "</script";
                            _pos = _html.IndexOf(endScript, _pos,
                              StringComparison.OrdinalIgnoreCase);
                            NormalizePosition();
                            Move(endScript.Length);
                            SkipWhitespace();
                            if (Peek() == '>')
                                Move();
                        }

                        // Return true if requested tag was found
                        if (result)
                            return true;
                    }
                }
                return false;
            }

            /// <summary>
            /// Parses the contents of an HTML tag. The current position should
            /// be at the first character following the tag's opening less-than
            /// character.
            /// 
            /// Note: We parse to the end of the tag even if this tag was not
            /// requested by the caller. This ensures subsequent parsing takes
            /// place after this tag. A tag that is cut off by the end of the
            /// document is treated as if it had been closed there.
            /// </summary>
            /// <param name="name">Name of the tag the caller is requesting,
            /// or "*" if caller is requesting all tags</param>
            /// <param name="tag">Returns information on this tag if it's one
            /// the caller is requesting</param>
            /// <returns>True if data is being returned for a tag requested by
            /// the caller or false otherwise</returns>
            protected bool ParseTag(string name, ref HtmlTag tag)
            {
                // Get name of this tag
                string s = ParseTagName();

                // Special handling. Comparisons are ordinal so that e.g.
                // "SCRIPT" is recognized regardless of the current culture.
                bool doctype = _scriptBegin = false;
                if (String.Equals(s, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                    doctype = true;
                else if (String.Equals(s, "script", StringComparison.OrdinalIgnoreCase))
                    _scriptBegin = true;

                // Is this a tag requested by caller?
                bool requested = false;
                if (name == "*" || String.Equals(s, name, StringComparison.OrdinalIgnoreCase))
                {
                    // Yes, create new tag object. HTML attribute names are
                    // case-insensitive, hence the comparer.
                    tag = new HtmlTag();
                    tag.Name = s;
                    tag.Attributes = new Dictionary<string, string>(
                      StringComparer.OrdinalIgnoreCase);
                    requested = true;
                }

                // Parse attributes. The EOF test is essential: at the end of
                // the document Peek() returns '\0' forever and nothing below
                // would advance, so an unterminated tag would never exit.
                SkipWhitespace();
                while (!EOF && Peek() != '>')
                {
                    if (Peek() == '/')
                    {
                        // Handle trailing forward slash
                        if (requested)
                            tag.TrailingSlash = true;
                        Move();
                        SkipWhitespace();
                        // If this is a script tag, it was closed
                        _scriptBegin = false;
                    }
                    else
                    {
                        // Parse attribute name (DOCTYPE tokens may be quoted,
                        // so they are read with the value parser instead)
                        s = (!doctype) ? ParseAttributeName() : ParseAttributeValue();
                        SkipWhitespace();
                        // Parse attribute value
                        string value = String.Empty;
                        if (Peek() == '=')
                        {
                            Move();
                            SkipWhitespace();
                            value = ParseAttributeValue();
                            SkipWhitespace();
                        }
                        // Add attribute to collection if requested tag;
                        // a later attribute replaces an earlier one with
                        // the same name
                        if (requested)
                            tag.Attributes[s] = value;
                    }
                }
                // Skip over closing '>' (no-op at end of document)
                Move();

                return requested;
            }

            /// <summary>
            /// Parses a tag name. The current position should be the first
            /// character of the name. The name ends at whitespace, '/', '>'
            /// or the end of the document, so "&lt;br/&gt;" yields "br".
            /// </summary>
            /// <returns>Returns the parsed name string</returns>
            protected string ParseTagName()
            {
                int start = _pos;
                while (!EOF && !Char.IsWhiteSpace(Peek()) && Peek() != '>'
                  && Peek() != '/')
                    Move();
                return _html.Substring(start, _pos - start);
            }

            /// <summary>
            /// Parses an attribute name. The current position should be the
            /// first character of the name. The name ends at whitespace, '=',
            /// '/', '>' or the end of the document.
            /// </summary>
            /// <returns>Returns the parsed name string</returns>
            protected string ParseAttributeName()
            {
                int start = _pos;
                while (!EOF && !Char.IsWhiteSpace(Peek()) && Peek() != '>'
                  && Peek() != '=' && Peek() != '/')
                    Move();
                return _html.Substring(start, _pos - start);
            }

            /// <summary>
            /// Parses an attribute value. The current position should be the
            /// first non-whitespace character following the equal sign.
            /// 
            /// Note: We terminate a quoted value if we encounter a new line.
            /// This seems to be the best way of handling errors such as values
            /// missing closing quotes, etc.
            /// </summary>
            /// <returns>Returns the parsed value string, without quotes</returns>
            protected string ParseAttributeValue()
            {
                int start, end;
                char c = Peek();
                if (c == '"' || c == '\'')
                {
                    // Move past opening quote
                    Move();
                    // Parse quoted value
                    start = _pos;
                    _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, start);
                    NormalizePosition();
                    end = _pos;
                    // Move past closing quote (but not past a line break)
                    if (Peek() == c)
                        Move();
                }
                else
                {
                    // Parse unquoted value
                    start = _pos;
                    while (!EOF && !Char.IsWhiteSpace(c) && c != '>')
                    {
                        Move();
                        c = Peek();
                    }
                    end = _pos;
                }
                return _html.Substring(start, end - start);
            }

            /// <summary>
            /// Moves to the start of the next tag
            /// </summary>
            /// <returns>True if another tag was found, false otherwise</returns>
            protected bool MoveToNextTag()
            {
                _pos = _html.IndexOf('<', _pos);
                NormalizePosition();
                return !EOF;
            }

            /// <summary>
            /// Returns the character at the current position, or a null
            /// character if we're at the end of the document
            /// </summary>
            /// <returns>The character at the current position</returns>
            public char Peek()
            {
                return Peek(0);
            }

            /// <summary>
            /// Returns the character at the specified number of characters
            /// beyond the current position, or a null character if the
            /// specified position is outside the document
            /// </summary>
            /// <param name="ahead">The number of characters beyond the
            /// current position</param>
            /// <returns>The character at the specified position</returns>
            public char Peek(int ahead)
            {
                int pos = (_pos + ahead);
                if (pos >= 0 && pos < _html.Length)
                    return _html[pos];
                return (char)0;
            }

            /// <summary>
            /// Moves the current position ahead one character
            /// </summary>
            protected void Move()
            {
                Move(1);
            }

            /// <summary>
            /// Moves the current position ahead the specified number of
            /// characters, stopping at the end of the document
            /// </summary>
            /// <param name="ahead">The number of characters to move ahead</param>
            protected void Move(int ahead)
            {
                _pos = Math.Min(_pos + ahead, _html.Length);
            }

            /// <summary>
            /// Moves the current position to the next character that is
            /// not whitespace
            /// </summary>
            protected void SkipWhitespace()
            {
                while (!EOF && Char.IsWhiteSpace(Peek()))
                    Move();
            }

            /// <summary>
            /// Normalizes the current position. This is primarily for handling
            /// conditions where IndexOf(), etc. return negative values when
            /// the item being sought was not found
            /// </summary>
            protected void NormalizePosition()
            {
                if (_pos < 0)
                    _pos = _html.Length;
            }
        }

        /// <summary>
        /// Sample usage: downloads the page at the given address and walks
        /// every anchor tag on it. The loop body is a placeholder; nothing
        /// is done with the links yet.
        /// </summary>
        /// <param name="url">Address of the page to download</param>
        protected void ScanLinks(string url)
        {
            // Download page. WebClient is disposable, so release it as soon
            // as the download has completed.
            string html;
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(url);
            }

            // Scan links on this page
            HtmlTag tag;
            HtmlParser parse = new HtmlParser(html);
            while (parse.ParseNext("a", out tag))
            {
                // See if this anchor links to us
                string value;
                if (tag.Attributes.TryGetValue("href", out value))
                {
                    // value contains URL referenced by this link
                }
            }
        }


    }


}
